# Read the honey production data and the census region lookup as tibbles.
honey <- import(
  here("data", "honeyproduction.csv"),
  setclass = "tbl_df"
)
regions <- import(
  here("data", "us_census_bureau_regions_and_divisions.csv"),
  setclass = "tbl_df"
)

# Rename the key columns first, then lower-case every column name.
regions <- rename(regions, state = `State Code`, state_complete = State)
regions <- rename_with(regions, tolower)

# Attach the region lookup to every honey record (matched on `state`) and
# rescale total production by dividing by one million.
honey_full <- left_join(honey, regions, by = "state") %>%
  mutate(totalprod = totalprod / 1000000)

# Plotting subset: full state name (as a factor), total production and year.
total_prod <- honey_full %>%
  mutate(state_complete = as.factor(state_complete)) %>%
  select(state_complete, totalprod, year)
We made several attempts to show the change in growth over time. The first version was a heatmap. Highlighting the state data with ‘gghighlight’ obscures any data fluctuations for said state over the years; thus, we only highlighted the labels for the Western states.
# Distinct state names, sorted so that they follow the factor's level order.
states <- sort(unique(total_prod$state_complete))

# One colour per state label: the three highlighted western states get their
# own colour, every other state stays grey.
label_color <- ifelse(
  states == "Oregon",
  "#03b800",
  ifelse(
    states == "Washington",
    "#4714ff",
    ifelse(states == "California", "#ffbc1f", "gray30")
  )
)

# Matching font face: bold for the highlighted states, plain otherwise.
label_face <- ifelse(
  states == "Oregon" | states == "Washington" | states == "California",
  "bold",
  "plain"
)

# Heatmap of total production: years on the x axis, states on the y axis.
p1 <- ggplot(
  total_prod,
  aes(x = year, y = state_complete, fill = totalprod)
) +
  geom_tile() +
  scale_fill_viridis_c(option = "magma") +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    y = "U.S. States",
    fill = "Total production,\nin millions of lbs.",
    caption = "Data: #tidytuesday"
  ) +
  # gghighlight(state_complete %in% c("California", "Oregon", "Washington")) +
  theme(axis.text.y = element_text(color = label_color, face = label_face))
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
p1
# Heatmap with two-letter state codes on the x axis and years on the y axis.
#
# The axis labels here are state codes, whereas `label_color` / `label_face`
# are built from the full state names. The two orderings are not guaranteed
# to agree, so the label styling is derived from the codes themselves and
# each style is tied to the code it belongs to.
state_codes <- sort(unique(honey_full$state))
# Full name belonging to each code (first matching row of the joined data).
code_names <- honey_full$state_complete[match(state_codes, honey_full$state)]

# `%in%` never yields NA, so a code without a matching full name simply
# receives the default styling.
code_label_color <- ifelse(
  code_names %in% "Oregon", "#03b800",
  ifelse(
    code_names %in% "Washington", "#4714ff",
    ifelse(code_names %in% "California", "#ffbc1f", "gray30")
  )
)
code_label_face <- ifelse(
  code_names %in% c("Oregon", "Washington", "California"),
  "bold",
  "plain"
)

ggplot(honey_full, aes(state, year, fill = totalprod)) +
  geom_tile(color = "white", size = 0.25) +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  ) +
  # One break per year, with no padding around the tiles.
  scale_y_continuous(breaks = seq(1998, 2012, 1), expand = c(0, 0)) +
  guides(
    fill = guide_legend(
      title = "Total Production,\nin millions of lbs.",
      label.position = "bottom",
      label.hjust = 1,
      keywidth = 4,
      keyheight = .8
    )
  ) +
  theme(
    panel.grid.major = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_text(
      color = code_label_color,
      face = code_label_face
    )
  )
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
Another way to visualize the change over time while highlighting the Western states is a line plot in which all the other states are drawn in the background. Below are versions with the three Western states together and with each of them faceted individually.
# Line plot of total production per state, with the three western states
# highlighted and every other state drawn as a faint grey background line.
ggplot(total_prod, aes(year, totalprod, color = state_complete)) +
  geom_line(size = 1) +
  gghighlight(
    state_complete %in% c("California", "Oregon", "Washington"),
    # The predicate is evaluated per row rather than per group, so ask for
    # the ungrouped filter directly instead of letting gghighlight try a
    # grouped calculation, fail, and fall back with a warning.
    use_group_by = FALSE,
    unhighlighted_params = list(size = 0.5, colour = alpha("grey20", 0.2))
  ) +
  labs(
    y = "Total production, in millions of lbs.",
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  )
# Faceted variant of the highlighted line plot: one panel per highlighted
# western state.
p2 <- ggplot(total_prod, aes(year, totalprod, color = state_complete)) +
  geom_line(size = 1.5) +
  facet_wrap(~state_complete) +
  gghighlight(
    state_complete %in% c("California", "Oregon", "Washington"),
    # The predicate is evaluated per row rather than per group, so ask for
    # the ungrouped filter directly instead of letting gghighlight try a
    # grouped calculation, fail, and fall back with a warning.
    use_group_by = FALSE,
    unhighlighted_params = list(size = 0.5, colour = alpha("grey20", 0.2))
  ) +
  labs(
    y = "Total production, in millions of lbs.",
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  )
p2
We also made a line plot for each state. Besides highlighting the line for each Western state with a different color, we tried printing the labels of those states in a different style as well. However, ggplot was not accepting a vector of colors for ‘element_text()’, and none of the troubleshooting helped.
# Label styling for the per-state plots. The states are sorted so that the
# vectors follow the factor's level order; `unique()` alone returns the states
# in order of first appearance in the data, which can put the colours out of
# step with an alphabetically ordered axis.
states <- sort(unique(total_prod$state_complete))
label_color <- ifelse(
  states == "Oregon", "#03b800",
  ifelse(
    states == "Washington", "#4714ff",
    ifelse(states == "California", "#ffbc1f", "gray30")
  )
)
label_face <- ifelse(
  states == "Oregon" | states == "Washington" | states == "California",
  "bold",
  "plain"
)

# One panel per state: a thick grey line for every state, with the three
# western states over-drawn in their highlight colours.
total_prod %>%
  # Round trip through the wide layout so every state has a row for every
  # year (combinations absent from the data become NA).
  pivot_wider(names_from = state_complete, values_from = totalprod) %>%
  pivot_longer(-year, names_to = "state_complete", values_to = "totalprod") %>%
  ggplot(aes(year, totalprod)) +
  geom_line(colour = "grey40", size = 1.5) +
  geom_line(
    data = filter(total_prod, str_detect(state_complete, "California")),
    color = "#ffbc1f"
  ) +
  geom_line(
    data = filter(total_prod, str_detect(state_complete, "Oregon")),
    color = "#03b800"
  ) +
  geom_line(
    data = filter(total_prod, str_detect(state_complete, "Washington")),
    color = "#4714ff"
  ) +
  facet_wrap(~state_complete) +
  # NOTE(review): vectorised `element_text()` input is not officially
  # supported by ggplot2; confirm whether the strip labels actually pick up
  # the per-state styling.
  theme(strip.text = element_text(color = label_color, face = label_face)) +
  labs(
    y = "Total production, in millions of lbs.",
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  )
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
Finally, we tried creating a dumbbell plot to show the difference in honey production between 1998 and 2012. That plot is not ideal, since it only shows the change between the initial and final points, without reflecting any fluctuations in between.
# Data for the dumbbell plot: only the 1998 and 2012 records, with the year
# stored as a character so it maps to a discrete point colour.
dumbbell <- total_prod %>%
  mutate(
    year = as.character(year),
    state_complete = as.factor(state_complete)
  ) %>%
  filter(year %in% c("1998", "2012"))

# One grey connector per state, re-drawn in colour for the three western
# states, with the two yearly values shown as points on top.
ggplot(dumbbell, aes(x = totalprod, y = state_complete)) +
  geom_line(aes(group = state_complete), color = "gray40") +
  geom_line(
    data = filter(dumbbell, str_detect(state_complete, "California")),
    color = "#ffbc1f"
  ) +
  geom_line(
    data = filter(dumbbell, str_detect(state_complete, "Oregon")),
    color = "#015200"
  ) +
  geom_line(
    data = filter(dumbbell, str_detect(state_complete, "Washington")),
    color = "#2600ad"
  ) +
  geom_point(aes(color = year)) +
  theme(axis.text.y = element_text(color = label_color, face = label_face)) +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Change in production between 1998 and 2012 by state",
    x = "Total production, in millions of lbs.",
    y = "U.S. state",
    caption = "Data: #tidytuesday"
  )
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
We tried to reproduce color-blind-friendly versions of one of the heatmaps and one of the faceted plots. The heatmap uses a sequential palette, while the faceted plot uses a qualitative one. The heatmap better illustrates the gradation of change over the years and lets the different palettes show, while the faceted plot allows for more effective highlighting of the Western states.
# Colour-vision-deficiency simulations of the first heatmap (p1) and the
# faceted line plot (p2).
colorblindr::cvd_grid(p1)
colorblindr::cvd_grid(p2)

# Reminder: the colour-blind-safe sequential palettes can be listed with
# display.brewer.all(type = "seq", colorblindFriendly = TRUE)

# Same heatmap as p1, drawn with the sequential "Blues" palette.
p3 <- ggplot(
  total_prod,
  aes(x = year, y = state_complete, fill = totalprod)
) +
  geom_tile() +
  scale_fill_continuous_sequential(palette = "Blues", begin = 0.25, end = 1) +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    y = "U.S. States",
    fill = "Total production,\nin millions of lbs.",
    caption = "Data: #tidytuesday"
  ) +
  theme(axis.text.y = element_text(color = label_color, face = label_face))
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
p3

# Check the new palette under the same simulations.
colorblindr::cvd_grid(p3)
# Faceted highlighted line plot, coloured with the Okabe-Ito palette.
p4 <- ggplot(total_prod, aes(year, totalprod, color = state_complete)) +
  geom_line(size = 1.5) +
  facet_wrap(~state_complete) +
  gghighlight(
    state_complete %in% c("California", "Oregon", "Washington"),
    # The predicate is evaluated per row rather than per group, so ask for
    # the ungrouped filter directly instead of letting gghighlight try a
    # grouped calculation, fail, and fall back with a warning.
    use_group_by = FALSE,
    unhighlighted_params = list(size = 0.5, colour = alpha("grey20", 0.2))
  ) +
  scale_color_OkabeIto() +
  labs(
    y = "Total production, in millions of lbs.",
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  )
p4

# Check the palette under simulated colour-vision deficiencies.
colorblindr::cvd_grid(p4)
(Note: I tried out two different qualitative color palettes for this bar plot.)
It appears that, on average, states in the Midwestern and Western regions of the US produce the most honey and that states in the Northeast region produce the least. States from the Southern region of the US also appear to produce less overall, with the exception of Florida. The states with the most honey production are fairly large in terms of geographic area. It would be interesting to see if there is a relationship between geographic area of a state and honey production (e.g., smaller states are associated with lower honey production). I’m also curious if there is a certain crop grown in North and South Dakota that commercial honey bees forage on (e.g., clover) and if there is a relationship between prevalence of wind-pollinated crops (e.g., corn, wheat, grass seed) and lower commercial honey production.
# load data
honey <- import(here("data", "honeyproduction.csv"), setclass = "tbl_df")
regions <- import(
  here("data", "us_census_bureau_regions_and_divisions.csv"),
  setclass = "tbl_df"
) %>%
  rename(state = `State Code`, state_complete = State) %>%
  rename_with(tolower)

# join honey and regions
honey_regions <- left_join(honey, regions, by = "state")

# number of observations per state, to check whether every state has the
# same number of records
state_obs <- count(honey_regions, state)

# average production across years (raw and in millions), one row per state,
# keeping the region alongside
honey_st_avg <- honey_regions %>%
  group_by(state_complete, region) %>%
  summarize(
    state_avg_mil = mean(totalprod) / 1000000,
    state_avg = mean(totalprod),
    # Return an ungrouped tibble: otherwise the result stays grouped by
    # `state_complete` and the grouping leaks into later steps.
    .groups = "drop"
  )
#' Bar chart of average yearly honey production per state, filled by region.
#'
#' The two palette trials below differ only in their fill scale, so the shared
#' layers live here and the scale is passed in.
#'
#' @param fill_scale A ggplot2 fill scale object to add to the plot.
#' @param data Data frame with columns `state_complete`, `state_avg_mil` and
#'   `region`; defaults to `honey_st_avg`.
#' @return A ggplot object.
plot_state_avg <- function(fill_scale, data = honey_st_avg) {
  ggplot(
    data,
    aes(
      x = state_avg_mil,
      # Order the bars by average production.
      y = fct_reorder(state_complete, state_avg_mil),
      fill = region
    )
  ) +
    geom_col(alpha = .9) +
    fill_scale +
    scale_x_continuous(
      expand = c(0, 0),
      breaks = c(0, 10, 20, 30),
      labels = c("0", "10 million", "20 million", "30 million")
    ) +
    labs(
      title = "US Honey Production by State",
      subtitle = "1998 to 2012",
      y = "State",
      x = "Average Yearly Production (Pounds)",
      caption = "Source: #tidytuesday",
      fill = "Region"
    ) +
    # The complete theme goes first so the tweaks below are not overwritten.
    theme_minimal() +
    theme(
      plot.title.position = "plot",
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank()
    )
}

# testing out okabe ito
plot_state_avg(scale_fill_OkabeIto())

# v2 with 'dark 2'
plot_state_avg(scale_fill_discrete_qualitative("dark 2"))
# calculate average production (raw and in millions) for every region and year
honey_avg_region <- honey_regions %>%
  group_by(region, year) %>%
  summarize(
    reg_avg_mil = mean(totalprod) / 1000000,
    reg_avg = mean(totalprod),
    # Return an ungrouped tibble: otherwise the result stays grouped by
    # `region` and the grouping leaks into later steps.
    .groups = "drop"
  )

# Heatmap of the regional averages: years on the x axis, regions on the y
# axis ordered by their average production.
honey_avg_region %>%
  ggplot(aes(year, fct_reorder(region, reg_avg_mil))) +
  geom_tile(aes(fill = reg_avg_mil)) +
  scale_fill_continuous_sequential(
    palette = "Red-Yellow",
    breaks = c(2, 4, 6, 8),
    labels = c("2 million", "4 million", "6 million", "8 million")
  ) +
  scale_x_continuous(expand = c(0, 0), breaks = c(1998, 2003, 2008, 2012)) +
  labs(
    title = "US Regional Honey Production",
    subtitle = "1998 to 2012",
    y = "Region",
    x = "Year",
    caption = "Source: #tidytuesday",
    fill = "Average Production (Pounds)"
  ) +
  # The complete theme goes first so the tweak below is not overwritten.
  theme_minimal() +
  theme(
    plot.title.position = "plot",
    panel.grid = element_blank()
  )
# US state geometries, with the state-code column renamed to `state` so it
# can serve as the join key with the honey data.
us <- usa_sf()
us <- rename(us, state = iso_3166_2)

# Keep every state geometry, attaching honey records where they exist.
# The key is spelled out instead of being inferred from shared column names.
# NOTE(review): assumes `state` is the only column the two tables share -
# confirm.
df <- right_join(honey, us, by = "state")
df_tib <- as_tibble(df)

# One map per year, filled by the price of honey per pound.
ggplot(data = df_tib, aes(geometry = geometry, fill = priceperlb)) +
  # `alpha` belongs on the layer: passed to `ggplot()` it was silently
  # ignored.
  geom_sf(color = "white", size = 0, alpha = 0.9) +
  guides(color = "none") +
  facet_wrap(~year) +
  scale_fill_viridis(
    name = "Price per pound",
    option = "magma",
    # Full argument name rather than relying on partial matching of `label`.
    labels = scales::dollar,
    limits = c(0, 5)
  ) +
  labs(
    title = "Price of honey per pound from 1998-2012",
    caption = "Source: #tidytuesday"
  ) +
  theme_void()